###################################################
### Project: Issue Competition in Parliamentary ###
###          Speeches?                          ###
### Title:   Validation                         ###
### Author:  Christoph Ivanusch                 ###
###################################################

# Preparation

## NOTE: the global environment is intentionally NOT cleared here.
## `rm(list = ls())` would delete the caller's objects whenever this script is
## sourced, and it does not reset loaded packages or options, so it gives no
## real guarantee of a clean state. Every object used below is assigned in this
## script before it is read; run the script in a fresh R session for a
## reproducible result.

## load packages
library(caret)
library(e1071)
library(quanteda)
library(newsmap)
library(dplyr)

## read the hand-coded validation sample from disk
validation <- readRDS(file = "Validation_Data.RDS")

## define the seed-word dictionary
## one entry per issue category; the entry names are reused below as the issue
## labels, so their order and spelling matter
seedword_dict <- dictionary(list(
  Greeting = c(
    "geehrt*", "Hohes Haus", "Kolleg*", "Herren", "Damen", "Geschätzter"
  ),
  Labor_Welfare = c(
    "Beschäftigung*", "Arbeit*", "Beruf*", "Pension*", "Gehalt*", "Lohn*",
    "Gewerkschaft*", "Konsumentensch*", "Produktsicherheit*", "Armut*",
    "Wohlfahrt*", "sozial*", "Einkommen", "Sozial*", "Mindestsicherung*"
  ),
  EU_ForeignAffairs = c(
    "EU*", "Europäische* Union", "Europapolitik", "Mitgliedsstaat*", "EZB*",
    "Brüssel*", "Kommission*", "Außenpolit*", "außenpolit*", "Diplomat*",
    "UN", "Entwicklungs*"
  ),
  Housing = c(
    "Bau", "Wohnbau*", "Miete*", "Wohnung*", "Sanierung*", "Hausbau*",
    "Häuser*", "Immobilie*"
  ),
  Finances = c(
    "Finanz*", "Budget*", "Kredit*", "Schulden*", "Bank*", "Ausgabe*",
    "Einsparung*", "Steuer*", "Rechnungshof*"
  ),
  Family_Youth = c(
    "Famili*", "Kind*", "Eltern*", "Geschwist*", "Jugend*"
  ),
  Science = c(
    "Wissenschaft*", "Forsch*", "Universitä*", "Labor*"
  ),
  Education = c(
    "Bildung*", "Unterricht*", "Schul*", "Schüler*", "Studen*",
    "Universitä*", "Kindergart*", "Lehr*", "Ausbildung*", "Studierend*",
    "Studiengebühr*"
  ),
  Health = c(
    "Gesund*", "Krank*", "Spital*", "Medizin*", "medizin*", "Patient*",
    "Rett*", "Arznei*", "Apothek*", "Arzt", "Ärzt*", "Pflege*"
  ),
  Women = c(
    "Frauen*", "Gleichbehandlung*", "Gleichstell*", "Gender*",
    "Ungleichbehandlung*", "Diskrimin*"
  ),
  Security_Crime = c(
    "Krimin*", "Sicherheit*", "Straf*", "Polizei*", "Mord*", "Gewalt*",
    "Waffe*", "Dieb*", "Exekutive", "Polizist*"
  ),
  JudicialSystem = c(
    "Justiz*", "Gericht*", "Richter*", "Staatsanwalt*", "Strafprozess*",
    "Gefängnis*", "Haft*", "Verfassung*", "Volksanwalt*"
  ),
  Parliament = c(
    "Ausschuss*", "U-Ausschuss*", "Geschäftsord*", "Ordnungsruf*",
    "Tagesordnung*", "parlament*", "Parlament*", "Untersuchungsausschuss"
  ),
  Culture = c(
    "Kunst*", "Künst*", "Kultur*", "künstler*", "ORF", "Film*", "Theat*",
    "Schauspiele*", "aufführ*", "Musik*", "Galerie*"
  ),
  Defense = c(
    "Verteidig*", "Heer*", "Bundesheer*", "Mili*", "Soldat*", "Waffe*",
    "Luftwaff*", "Trupp*", "Grundwehr*", "Präsenzdien*"
  ),
  Agriculture = c(
    "Landwirt*", "Bauer*", "Forst*", "Wald*", "Agrar*", "Lebensmittel*",
    "Pflanzen*", "Bäuerin*"
  ),
  CivilRights = c(
    "Menschenrecht*", "Bürgerrecht*", "Meinungsfreiheit*", "Pressefreiheit*",
    "Versammlungsfreiheit*", "Informationsfreiheit*", "Wahlrecht*",
    "Grundfreiheit*", "Grundrecht*"
  ),
  Sports = c(
    "Sport*", "Olympia*", "Training*", "trainier*"
  ),
  Economy_Energy = c(
    "Wirtschaft*", "wirtschaftl*", "Industrie", "Tourismus*", "touristisch*",
    "BIP*", "Inflation*", "Energie*", "Gas*", "Unternehme*", "Betrieb*",
    "wirtschaftlich*", "Firma", "Firmen*"
  ),
  Environment = c(
    "Umwelt*", "Natur*", "Klima*", "Luftver*", "Feinstaub*", "Emission*",
    "Treibhaus*", "Nachhaltig*", "ökolog*"
  ),
  Immigration = c(
    "Migra*", "Integration*", "integrier*", "Flüchtling*", "Ausländ*",
    "Asyl*", "multikult*"
  ),
  Transportation = c(
    "Infrastruktur*", "Verkehr*", "Transport*", "Transit*", "ÖBB", "ASFINAG",
    "Straße*", "Fahrzeug*", "PKW*", "LKW*", "Züge*", "Bahn*", "Auto*", "Flug*"
  )
))

## issue categories: the dictionary entry names plus the literal label "NA",
## which is the label given to missing predictions further below
issues <- c(names(seedword_dict), "NA")

## read the classification output and strip leading/trailing whitespace from
## the speaker names (speaker is one of the join keys used below)
df_classification <- readRDS(file = "Classification_Data.RDS")
df_classification$speaker <- trimws(x = df_classification$speaker, which = "both")

## merge validation and classification files
## right_join keeps every row of the hand-coded validation file; validation
## rows without a matching classification row get NA in the prediction columns
df <- right_join(df_classification, validation,
                 by = c("docname_", "docid_", "segid_", "gp", "date", "speaker", "party"))
df <- df[order(df$date), ]

## drop the duplicated text column that the join suffixed with ".y"
## (the column coming from the validation file)
df$text.y <- NULL

## recode missing predictions as the literal label "NA" (a level in `issues`)
## coerce to character first: if a prediction column is stored as a factor,
## assigning the unknown label "NA" only raises a warning and leaves the value
## missing, so those rows would silently drop out of the confusion matrix
df$pred <- as.character(df$pred)
df$pred_smoothed <- as.character(df$pred_smoothed)

df$pred[is.na(df$pred)] <- "NA"
df$pred_smoothed[is.na(df$pred_smoothed)] <- "NA"

# Validation Statistics

## preparation
## raw label vectors (kept unfactored because the f1 computation reuses them)
handcoded_unfactored <- df$Coded_Issue
pred_unfactored <- df$pred_smoothed

## factor versions sharing one level set, so both have identical levels
handcoded <- factor(x = handcoded_unfactored, levels = issues)
pred <- factor(x = pred_unfactored, levels = issues)

## create confusion matrix (predictions as data, hand-coded labels as reference)
confusion_matrix <- confusionMatrix(data = pred, reference = handcoded)

## get accuracy and kappa (first two entries of the overall statistics)
stats_validation <- confusion_matrix$overall
round(stats_validation[1:2], digits = 2)

## get f1-score (micro-average f1-score)
acc <- accuracy(pred_unfactored, handcoded_unfactored)

### summarise once instead of recomputing the summary for every term
### elements 1 and 2 are used as precision (p) and recall (r), as in the
### formula below -- NOTE(review): confirm the element order against the
### installed newsmap version
acc_summary <- summary(acc)
precision_micro <- acc_summary[[1]]
recall_micro <- acc_summary[[2]]

### formula: 2 * (p*r)/(p+r)
### computed at full precision; only the reported value is rounded, so no
### rounding error from p and r carries into the f1-score
f1_micro <- 2 * (precision_micro * recall_micro) /
  (precision_micro + recall_micro)
round(f1_micro, digits = 2)
